#include "ref.h"

/**
 * Reference floating-point sparse FIR filter.
 *
 * The incoming block is first pushed into the circular state buffer. Then,
 * for each entry of the tap-delay table, one block of delayed samples is
 * gathered from the state buffer into pScratchIn, scaled by the matching
 * coefficient and summed into pDst. The first tap overwrites pDst, every
 * further tap accumulates onto it.
 *
 * @param S          filter instance; pState, pCoeffs, pTapDelay, maxDelay and
 *                   numTaps are read, and stateIndex is handed to the circular
 *                   write (which is expected to advance it)
 * @param pSrc       input samples, blockSize values
 * @param pDst       output samples, blockSize values
 * @param pScratchIn scratch buffer holding blockSize delayed samples per tap
 * @param blockSize  number of samples processed by this call
 */
void ref_fir_sparse_f32(riscv_fir_sparse_instance_f32 *S, float32_t *pSrc,
                        float32_t *pDst, float32_t *pScratchIn,
                        uint32_t blockSize)
{
    float32_t *const stateBuf = S->pState;   /* circular state buffer */
    const float32_t *coeffPtr = S->pCoeffs;  /* next coefficient to use */
    int32_t *delayPtr = S->pTapDelay;        /* next tap delay to use */
    const uint32_t stateLen = S->maxDelay + blockSize; /* state length */
    const uint16_t tapTotal = S->numTaps;    /* number of non-zero taps */
    float32_t tapCoeff = *coeffPtr++;        /* coefficient of the first tap */
    int32_t newest;     /* state index of the block just written */
    int32_t rdIdx;      /* read position inside the state buffer */
    uint32_t remaining; /* taps still to be processed */
    uint32_t i;         /* sample index within the block */

    /* Append the new input block to the state buffer at S->stateIndex. */
    riscv_circularWrite_f32((int32_t *)stateBuf, stateLen, &S->stateIndex, 1,
                            (int32_t *)pSrc, 1, blockSize);

    /* Every tap reads relative to the start of the block just written. */
    newest = (int32_t)S->stateIndex - (int32_t)blockSize;

    /* ---- first tap: initialises the output block ---- */
    rdIdx = newest - *delayPtr++;
    if (rdIdx < 0) {
        /* wrap a negative index back into the state buffer */
        rdIdx += (int32_t)stateLen;
    }

    riscv_circularRead_f32((int32_t *)stateBuf, stateLen, &rdIdx, 1,
                           (int32_t *)pScratchIn, (int32_t *)pScratchIn,
                           blockSize, 1, blockSize);

    for (i = 0U; i < blockSize; i++) {
        pDst[i] = pScratchIn[i] * tapCoeff;
    }

    /* ---- remaining taps: accumulate onto the output block ---- */
    for (remaining = (uint32_t)tapTotal - 1U; remaining > 0U; remaining--) {
        tapCoeff = *coeffPtr++;

        rdIdx = newest - *delayPtr++;
        if (rdIdx < 0) {
            rdIdx += (int32_t)stateLen;
        }

        riscv_circularRead_f32((int32_t *)stateBuf, stateLen, &rdIdx, 1,
                               (int32_t *)pScratchIn, (int32_t *)pScratchIn,
                               blockSize, 1, blockSize);

        for (i = 0U; i < blockSize; i++) {
            pDst[i] += pScratchIn[i] * tapCoeff;
        }
    }
}

/**
 * Reference Q31 sparse FIR filter.
 *
 * The input block is written into the circular state buffer. For each entry
 * of the tap-delay table a block of delayed samples is gathered into
 * pScratchIn, multiplied by the matching coefficient, and the high 32 bits of
 * each 64-bit product are accumulated in pDst. A final pass shifts every
 * accumulated value left by one bit to bring the result to 1.31 format.
 *
 * @param S          filter instance; pState, pCoeffs, pTapDelay, maxDelay and
 *                   numTaps are read, and stateIndex is handed to the circular
 *                   write (which is expected to advance it)
 * @param pSrc       input samples, blockSize values
 * @param pDst       output samples, blockSize values; also used as the
 *                   accumulator while the taps are processed
 * @param pScratchIn scratch buffer holding blockSize delayed samples per tap
 * @param blockSize  number of samples processed by this call
 */
void ref_fir_sparse_q31(riscv_fir_sparse_instance_q31 *S, q31_t *pSrc,
                        q31_t *pDst, q31_t *pScratchIn, uint32_t blockSize)
{
    q31_t *pState = S->pState;         /* State pointer */
    const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
    q31_t *px;                         /* Scratch buffer pointer */
    q31_t *pb = pScratchIn;            /* Base of the scratch buffer */
    q31_t *pOut;                       /* Destination pointer */
    q63_t out;                         /* Temporary output variable */
    int32_t *pTapDelay = S->pTapDelay; /* Offsets of the non-zero taps */
    uint32_t delaySize = S->maxDelay + blockSize; /* state length */
    uint16_t numTaps = S->numTaps;     /* Number of non-zero taps */
    int32_t readIndex;                 /* Read index of the state buffer */
    uint32_t tapCnt, blkCnt;           /* loop counters */
    q31_t coeff = *pCoeffs++;          /* Read the first coefficient value */

    /* blockSize input samples are copied into the state buffer, starting at
     * the position given by S->stateIndex. q31_t data is moved with the
     * 32-bit word helper, hence the _f32 name and the int32_t casts. */
    riscv_circularWrite_f32((int32_t *)pState, delaySize, &S->stateIndex, 1,
                            (int32_t *)pSrc, 1, blockSize);

    /* Read index for the first tap. The subtraction is done in signed
     * arithmetic (same form as the f32 and q7 variants) instead of relying on
     * unsigned wrap-around followed by a cast. */
    readIndex = ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

    /* Wraparound of readIndex */
    if (readIndex < 0) {
        readIndex += (int32_t)delaySize;
    }

    /* blockSize samples are read from the state buffer into the scratch */
    riscv_circularRead_f32((int32_t *)pState, delaySize, &readIndex, 1,
                           (int32_t *)pb, (int32_t *)pb, blockSize, 1,
                           blockSize);

    px = pb;
    pOut = pDst;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* First tap: keep the high word of the 64-bit product */
        *pOut++ = (q31_t)(((q63_t)*px++ * coeff) >> 32);
        blkCnt--;
    }

    /* Loop over the remaining taps. */
    tapCnt = (uint32_t)numTaps - 1U;

    while (tapCnt > 0U) {
        /* Load the coefficient for this tap */
        coeff = *pCoeffs++;

        /* Read index for this tap, with wraparound */
        readIndex =
            ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

        if (readIndex < 0) {
            readIndex += (int32_t)delaySize;
        }

        /* blockSize samples are read from the state buffer */
        riscv_circularRead_f32((int32_t *)pState, delaySize, &readIndex, 1,
                               (int32_t *)pb, (int32_t *)pb, blockSize, 1,
                               blockSize);

        px = pb;
        pOut = pDst;
        blkCnt = blockSize;

        while (blkCnt > 0U) {
            /* Multiply-accumulate in 64 bits, store back the low 32 bits */
            out = *pOut;
            out += ((q63_t)*px++ * coeff) >> 32;
            *pOut++ = (q31_t)(out);
            blkCnt--;
        }

        tapCnt--;
    }

    /* Output is converted into 1.31 format. */
    pOut = pDst;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* Shift in unsigned arithmetic: left-shifting a negative signed value
         * is undefined behaviour in C, while the unsigned shift yields the
         * same bit pattern with defined semantics. */
        *pOut = (q31_t)((uint32_t)*pOut << 1);
        pOut++;
        blkCnt--;
    }
}

/**
 * Reference Q15 sparse FIR filter.
 *
 * The input block is written into the circular state buffer. For each entry
 * of the tap-delay table a block of delayed samples is gathered into
 * pScratchIn, multiplied by the matching coefficient, and the 32-bit products
 * are accumulated in pScratchOut. Finally every accumulator is shifted right
 * by 15 bits, saturated to 16 bits and stored in pDst.
 *
 * @param S           filter instance; pState, pCoeffs, pTapDelay, maxDelay
 *                    and numTaps are read, and stateIndex is handed to the
 *                    circular write (which is expected to advance it)
 * @param pSrc        input samples, blockSize values
 * @param pDst        output samples, blockSize values
 * @param pScratchIn  scratch buffer holding blockSize delayed samples per tap
 * @param pScratchOut scratch buffer holding blockSize 32-bit accumulators
 * @param blockSize   number of samples processed by this call
 */
void ref_fir_sparse_q15(riscv_fir_sparse_instance_q15 *S, q15_t *pSrc,
                        q15_t *pDst, q15_t *pScratchIn, q31_t *pScratchOut,
                        uint32_t blockSize)
{
    q15_t *pState = S->pState;         /* State pointer */
    const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
    q15_t *pOut = pDst;                /* Working pointer for output */
    q15_t *px;                         /* Working pointer into the scratch */
    q15_t *pb = pScratchIn;            /* Base of the input scratch buffer */
    q31_t *pAcc;                       /* Working pointer into pScratchOut */
    int32_t *pTapDelay = S->pTapDelay; /* Offsets of the non-zero taps */
    uint32_t delaySize = S->maxDelay + blockSize; /* state length */
    uint16_t numTaps = S->numTaps;     /* Number of non-zero taps */
    int32_t readIndex;                 /* Read index of the state buffer */
    uint32_t tapCnt, blkCnt;           /* loop counters */
    q15_t coeff = *pCoeffs++;          /* Read the first coefficient value */
    q31_t acc;                         /* One accumulator value */

    /* blockSize input samples are copied into the state buffer, starting at
     * the position given by S->stateIndex. */
    riscv_circularWrite_q15(pState, delaySize, &S->stateIndex, 1, pSrc, 1,
                            blockSize);

    /* Read index for the first tap. Computed in signed arithmetic (same form
     * as the f32 and q7 variants) rather than mixing unsigned and signed
     * operands and narrowing the result implicitly. */
    readIndex = ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

    /* Wraparound of readIndex */
    if (readIndex < 0) {
        readIndex += (int32_t)delaySize;
    }

    /* blockSize samples are read from the state buffer */
    riscv_circularRead_q15(pState, delaySize, &readIndex, 1, pb, pb, blockSize,
                           1, blockSize);

    px = pb;
    pAcc = pScratchOut;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* First tap: the product initialises the accumulator */
        *pAcc++ = ((q31_t)*px++ * coeff);
        blkCnt--;
    }

    /* Loop over the remaining taps. */
    tapCnt = (uint32_t)numTaps - 1U;

    while (tapCnt > 0U) {
        /* Load the coefficient for this tap */
        coeff = *pCoeffs++;

        /* Read index for this tap, with wraparound */
        readIndex =
            ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

        if (readIndex < 0) {
            readIndex += (int32_t)delaySize;
        }

        /* blockSize samples are read from the state buffer */
        riscv_circularRead_q15(pState, delaySize, &readIndex, 1, pb, pb,
                               blockSize, 1, blockSize);

        px = pb;
        pAcc = pScratchOut;
        blkCnt = blockSize;

        while (blkCnt > 0U) {
            /* Perform Multiply-Accumulate */
            *pAcc++ += (q31_t)*px++ * coeff;
            blkCnt--;
        }

        tapCnt--;
    }

    /* All the output values are in the pScratchOut buffer.
       Shift them down by 15 bits, saturate to 16 bits and store them in the
       destination buffer. */
    pAcc = pScratchOut;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* Fetch first so no side effect sits inside the __SSAT argument
         * (safe even if __SSAT is implemented as a macro). */
        acc = *pAcc++;
        *pOut++ = (q15_t)__SSAT(acc >> 15, 16);
        blkCnt--;
    }
}

/**
 * Reference Q7 sparse FIR filter.
 *
 * The input block is written into the circular state buffer. For each entry
 * of the tap-delay table a block of delayed samples is gathered into
 * pScratchIn, multiplied by the matching coefficient, and the 32-bit products
 * are accumulated in pScratchOut. Finally every accumulator is shifted right
 * by 7 bits, saturated to 8 bits and stored in pDst.
 *
 * @param S           filter instance; pState, pCoeffs, pTapDelay, maxDelay
 *                    and numTaps are read, and stateIndex is handed to the
 *                    circular write (which is expected to advance it)
 * @param pSrc        input samples, blockSize values
 * @param pDst        output samples, blockSize values
 * @param pScratchIn  scratch buffer holding blockSize delayed samples per tap
 * @param pScratchOut scratch buffer holding blockSize 32-bit accumulators
 * @param blockSize   number of samples processed by this call
 */
void ref_fir_sparse_q7(riscv_fir_sparse_instance_q7 *S, q7_t *pSrc, q7_t *pDst,
                       q7_t *pScratchIn, q31_t *pScratchOut, uint32_t blockSize)
{
    q7_t *pState = S->pState;          /* State pointer */
    const q7_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
    q7_t *px;                          /* Working pointer into the scratch */
    q7_t *pb = pScratchIn;             /* Base of the input scratch buffer */
    q7_t *pOut = pDst;                 /* Destination pointer */
    q31_t *pAcc;                       /* Working pointer into pScratchOut */
    int32_t *pTapDelay = S->pTapDelay; /* Offsets of the non-zero taps */
    uint32_t delaySize = S->maxDelay + blockSize; /* state length */
    uint16_t numTaps = S->numTaps;     /* Number of non-zero taps */
    int32_t readIndex;                 /* Read index of the state buffer */
    uint32_t tapCnt, blkCnt;           /* loop counters */
    q7_t coeff = *pCoeffs++;           /* Read the first coefficient value */
    q31_t acc;                         /* One accumulator value */

    /* blockSize input samples are copied into the state buffer, starting at
     * the position given by S->stateIndex. */
    riscv_circularWrite_q7(pState, (int32_t)delaySize, &S->stateIndex, 1, pSrc,
                           1, blockSize);

    /* Read index for the first tap */
    readIndex = ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

    /* Wraparound of readIndex */
    if (readIndex < 0) {
        readIndex += (int32_t)delaySize;
    }

    /* blockSize samples are read from the state buffer */
    riscv_circularRead_q7(pState, (int32_t)delaySize, &readIndex, 1, pb, pb,
                          (int32_t)blockSize, 1, blockSize);

    px = pb;
    pAcc = pScratchOut;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* First tap: the product initialises the accumulator */
        *pAcc++ = ((q31_t)*px++ * coeff);
        blkCnt--;
    }

    /* Loop over the remaining taps. */
    tapCnt = (uint32_t)numTaps - 1U;

    while (tapCnt > 0U) {
        /* Load the coefficient for this tap */
        coeff = *pCoeffs++;

        /* Read index for this tap, with wraparound */
        readIndex =
            ((int32_t)S->stateIndex - (int32_t)blockSize) - *pTapDelay++;

        if (readIndex < 0) {
            readIndex += (int32_t)delaySize;
        }

        /* blockSize samples are read from the state buffer */
        riscv_circularRead_q7(pState, (int32_t)delaySize, &readIndex, 1, pb, pb,
                              (int32_t)blockSize, 1, blockSize);

        px = pb;
        pAcc = pScratchOut;
        blkCnt = blockSize;

        while (blkCnt > 0U) {
            /* Perform Multiply-Accumulate */
            *pAcc++ += (q31_t)*px++ * coeff;
            blkCnt--;
        }

        tapCnt--;
    }

    /* All the output values are in the pScratchOut buffer.
       Shift them down by 7 bits, saturate to 8 bits and store them in the
       destination buffer. */
    pAcc = pScratchOut;
    blkCnt = blockSize;

    while (blkCnt > 0U) {
        /* Fetch first so no side effect sits inside the __SSAT argument
         * (safe even if __SSAT is implemented as a macro). */
        acc = *pAcc++;
        *pOut++ = (q7_t)__SSAT(acc >> 7, 8);
        blkCnt--;
    }
}
